library(lme4)
library(ggplot2)
library(plotrix)
library(visreg)
library(stringr)
library(tidyverse)
library(ggpubr)
library(gridExtra)
library(tidyverse)
library(tidymodels)
library(vip) 
library(plm)
# Load the raw lasso sampling results. The columns named "0.01", "0.025",
# "0.05" and "0.1" each hold a free-text metrics string per dataset row.
new_format_lasso_data <- read_csv("/Users/noa/Workspace/lasso_positions_sampling_results/new_spr_lasso.csv")

# Signed decimal with an optional fraction and optional exponent. The previous
# pattern (`0.\\d+`) used an unescaped dot and required a leading zero, so
# negative scores, values >= 1 and exponent notation were extracted as NA.
lasso_number_regex <- "-?\\d+(?:\\.\\d+)?(?:[eE][-+]?\\d+)?"

# Reshape to one row per (input row, sample percentage) and pull the test R^2,
# test Spearman rho and number of chosen loci out of the metrics string.
# The *_raw columns keep the matched "key: value" text; the parsed columns hold
# the numeric value (NA when the key is not found in the string).
new_format_lasso_data_edited <- new_format_lasso_data %>%
  pivot_longer(
    cols = c("0.01", "0.025", "0.05", "0.1"),
    names_to = "sample_pct",
    values_to = "lasso_metrics"
  ) %>%
  mutate(
    test_r2_raw = str_extract(
      lasso_metrics,
      paste0("lasso_test_R\\^2.: ", lasso_number_regex)
    ),
    test_spearman_rho_raw = str_extract(
      lasso_metrics,
      paste0("lasso_test_spearmanr.: ", lasso_number_regex)
    ),
    number_loci_chosen_raw = str_extract(
      lasso_metrics,
      "number_loci_chosen.: \\d+"
    )
  ) %>%
  mutate(
    sample_pct = parse_number(sample_pct),
    # as.numeric() understands the sign and exponent captured above.
    test_r2 = as.numeric(
      str_remove(test_r2_raw, "^lasso_test_R\\^2.: ")
    ),
    test_spearman_rho = as.numeric(
      str_remove(test_spearman_rho_raw, "^lasso_test_spearmanr.: ")
    ),
    number_loci_chosen = as.numeric(
      str_remove(number_loci_chosen_raw, "^number_loci_chosen.: ")
    )
  )
# Add `actual_sample_pct` (sample_pct snapped down to the highest of the four
# thresholds it reaches; anything below 0.01 becomes NA because case_when has
# no fallback) and derive `dataset_name` from `dataset_id`: the text after
# "_supermatrices_", up to the next underscore. The helper columns `prefix`
# and `suffix` are dropped again; `dataset_id` itself is kept.
new_format_lasso_data_edited2 <- new_format_lasso_data_edited %>%
  mutate(
    actual_sample_pct = case_when(
      sample_pct >= 0.1 ~ 0.1,
      sample_pct >= 0.05 ~ 0.05,
      sample_pct >= 0.025 ~ 0.025,
      sample_pct >= 0.01 ~ 0.01
    )
  ) %>%
  separate(
    dataset_id,
    into = c("prefix", "last_value"),
    sep = "_supermatrices_",
    remove = FALSE
  ) %>%
  separate(
    last_value,
    into = c("dataset_name", "suffix"),
    sep = "_"
  ) %>%
  select(-c("prefix", "suffix"))
         


# Inspect the reshaped table.
new_format_lasso_data_edited2

# Number of rows per training size / sample percentage / n_seq / n_loci.
count(
  new_format_lasso_data_edited2,
  actual_training_size, actual_sample_pct, n_seq, n_loci
)

# Same breakdown, additionally split by dataset.
count(
  new_format_lasso_data_edited2,
  dataset_name, actual_training_size, actual_sample_pct, n_seq, n_loci
)
# Unexplained variance on the test set is defined here as 1 - test R^2.
unexplained_variance_data <- mutate(
  new_format_lasso_data_edited2,
  unexplained_var = 1 - test_r2
)

# Show the configuration columns next to the derived value.
select(
  unexplained_variance_data,
  n_seq, n_loci, actual_training_size, actual_sample_pct,
  test_r2, unexplained_var
)

# Median unexplained variance per configuration. median() is called without
# na.rm, so a group containing an NA yields NA.
unexplained_variance_data %>%
  group_by(n_seq, n_loci, actual_training_size, actual_sample_pct) %>%
  summarise(median_unexplained_var = median(unexplained_var))
# Console output: `summarise()` has grouped output by 'n_seq', 'n_loci', 'actual_training_size'. You can override using the `.groups` argument.
# Plot the median unexplained variance (1 - test R^2) against the sample
# percentage, one line per training size, facetted by n_seq (rows) and
# n_loci (columns). Error bars span median +/- one standard deviation, with
# the lower end clipped at zero.
unexplained_variance_data %>%
  group_by(actual_training_size, actual_sample_pct, n_loci, n_seq) %>%
  summarize(
    median_unexplained = median(unexplained_var),
    sd_unexplained = sd(unexplained_var)
  ) %>%
  ggplot(aes(
    x = actual_sample_pct,
    y = median_unexplained,
    group = as.factor(actual_training_size),
    color = as.factor(actual_training_size)
  )) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  geom_errorbar(
    aes(
      # pmax() clips each bar's lower end at zero individually. The scalar
      # max() used before collapsed the whole column to a single value, so
      # every error bar shared the same (largest) lower bound.
      ymin = pmax(median_unexplained - sd_unexplained, 0),
      ymax = median_unexplained + sd_unexplained
    ),
    position = position_dodge(0.005)
  ) +
  scale_x_continuous(
    labels = scales::percent,
    breaks = c(0.01, 0.025, 0.05, 0.1)
  ) +
  scale_y_continuous(labels = scales::percent, expand = c(0, 0)) +
  labs(
    color = "Training size",
    x = "Sample percentage",
    y = "Unexplained variance (%)",
    title = "Median percentage of unexplained variance on test sets"
  ) +
  theme_classic() +
  # coord_cartesian(ylim = c(0, NA)) +
  facet_grid(rows = vars(n_seq), cols = vars(n_loci))
# Console output: `summarise()` has grouped output by 'actual_training_size', 'actual_sample_pct', 'n_loci'. You can override using the `.groups` argument.

#example_msa_data %>% select (actual_sample_pct, lasso_running_time)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KGxtZTQpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShwbG90cml4KQpsaWJyYXJ5KHZpc3JlZykKbGlicmFyeShzdHJpbmdyKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShnZ3B1YnIpCmxpYnJhcnkoZ3JpZEV4dHJhKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeSh0aWR5bW9kZWxzKQpsaWJyYXJ5KHZpcCkgCmxpYnJhcnkocGxtKQpgYGAKCgpgYGB7cn0KbmV3X2Zvcm1hdF9sYXNzb19kYXRhPC0gcmVhZF9jc3YoIi9Vc2Vycy9ub2EvV29ya3NwYWNlL2xhc3NvX3Bvc2l0aW9uc19zYW1wbGluZ19yZXN1bHRzL25ld19zcHJfbGFzc28uY3N2IikKYGBgCmBgYHtyfQpuZXdfZm9ybWF0X2xhc3NvX2RhdGFfZWRpdGVkPC1uZXdfZm9ybWF0X2xhc3NvX2RhdGEgJT4lIHBpdm90X2xvbmdlcihjb2xzPSBjKCIwLjAxIiwiMC4wMjUiLCIwLjA1IiwiMC4xIiksIHZhbHVlc190bz0ibGFzc29fbWV0cmljcyIsIG5hbWVzX3RvID0gInNhbXBsZV9wY3QiKSAlPiUgbXV0YXRlICh0ZXN0X3IyX3JhdyA9IHN0cl9leHRyYWN0KGxhc3NvX21ldHJpY3MsJ2xhc3NvX3Rlc3RfUlxcXjIuOiAwLlxcZCsnKSx0ZXN0X3NwZWFybWFuX3Job19yYXcgPSBzdHJfZXh0cmFjdChsYXNzb19tZXRyaWNzLCdsYXNzb190ZXN0X3NwZWFybWFuci46IDAuXFxkKycpLCBudW1iZXJfbG9jaV9jaG9zZW5fcmF3ID0gc3RyX2V4dHJhY3QobGFzc29fbWV0cmljcywnbnVtYmVyX2xvY2lfY2hvc2VuLjogXFxkKycpKSAlPiUgbXV0YXRlIChzYW1wbGVfcGN0PSAgcGFyc2VfbnVtYmVyKHNhbXBsZV9wY3QpLHRlc3RfcjIgPSBwYXJzZV9udW1iZXIoc3RyX3JlcGxhY2UodGVzdF9yMl9yYXcsImxhc3NvX3Rlc3RfUlxcXjIuOiAiLCIiKSksdGVzdF9zcGVhcm1hbl9yaG8gPSBwYXJzZV9udW1iZXIoc3RyX3JlcGxhY2UodGVzdF9zcGVhcm1hbl9yaG9fcmF3LCJsYXNzb190ZXN0X3NwZWFybWFuci46ICIsIiIpKSxudW1iZXJfbG9jaV9jaG9zZW4gPSBwYXJzZV9udW1iZXIoc3RyX3JlcGxhY2UobnVtYmVyX2xvY2lfY2hvc2VuX3JhdywibnVtYmVyX2xvY2lfY2hvc2VuLjogIiwiIikpKQpgYGAKCgpgYGB7cn0KbmV3X2Zvcm1hdF9sYXNzb19kYXRhX2VkaXRlZDI8LSBuZXdfZm9ybWF0X2xhc3NvX2RhdGFfZWRpdGVkJT4lCiAgbXV0YXRlKGFjdHVhbF9zYW1wbGVfcGN0ID1jYXNlX3doZW4oc2FtcGxlX3BjdD49MC4xIH4gMC4xLCBzYW1wbGVfcGN0Pj0wLjA1IH4gMC4wNSxzYW1wbGVfcGN0Pj0wLjAyNSB+IDAuMDI1LCBzYW1wbGVfcGN0Pj0wLjAxIH4gMC4wMSkpICU+JSAgc2VwYXJhdGUoZGF0YXNldF9pZCwgIl9zdXBlcm1hdHJpY2VzXyIsCiAgICAgICAgICAgICAgICBpbnRvID0gYygicHJlZml4IiwgImxhc3RfdmFsdWUiKSwgCiAgICAgICAgICAgICAgICByZW1vdmUgPSBGQUxTRSkgJT4lIHNlcGFyYXRlKGxhc3RfdmFsdWUsIl8iLGludG8gPSBjKCJk
YXRhc2V0X25hbWUiLCAic3VmZml4IikpICU+JSAgc2VsZWN0KC1jKCJwcmVmaXgiLCJzdWZmaXgiKSkKICAgICAgICAgCgoKbmV3X2Zvcm1hdF9sYXNzb19kYXRhX2VkaXRlZDIKbmV3X2Zvcm1hdF9sYXNzb19kYXRhX2VkaXRlZDIgJT4lIGNvdW50KGFjdHVhbF90cmFpbmluZ19zaXplLCBhY3R1YWxfc2FtcGxlX3BjdCwgbl9zZXEsIG5fbG9jaSkKYGBgCmBgYHtyfQpuZXdfZm9ybWF0X2xhc3NvX2RhdGFfZWRpdGVkMiAlPiUgY291bnQoZGF0YXNldF9uYW1lLGFjdHVhbF90cmFpbmluZ19zaXplLCBhY3R1YWxfc2FtcGxlX3BjdCwgbl9zZXEsIG5fbG9jaSkKYGBgCgpgYGB7cn0KdW5leHBsYWluZWRfdmFyaWFuY2VfZGF0YTwtIG5ld19mb3JtYXRfbGFzc29fZGF0YV9lZGl0ZWQyICU+JSBtdXRhdGUgKHVuZXhwbGFpbmVkX3ZhciA9MS10ZXN0X3IyICkKCnVuZXhwbGFpbmVkX3ZhcmlhbmNlX2RhdGEgJT4lIHNlbGVjdCAobl9zZXEsIG5fbG9jaSxhY3R1YWxfdHJhaW5pbmdfc2l6ZSwgYWN0dWFsX3NhbXBsZV9wY3QsdGVzdF9yMiwgdW5leHBsYWluZWRfdmFyKQoKdW5leHBsYWluZWRfdmFyaWFuY2VfZGF0YSAlPiUgZ3JvdXBfYnkobl9zZXEsIG5fbG9jaSxhY3R1YWxfdHJhaW5pbmdfc2l6ZSwgYWN0dWFsX3NhbXBsZV9wY3QpICU+JSBzdW1tYXJpc2UobWVkaWFuX3VuZXhwbGFpbmVkX3ZhciA9IG1lZGlhbih1bmV4cGxhaW5lZF92YXIpKQoKdW5leHBsYWluZWRfdmFyaWFuY2VfZGF0YSAlPiUgZ3JvdXBfYnkoYWN0dWFsX3RyYWluaW5nX3NpemUsIGFjdHVhbF9zYW1wbGVfcGN0LCBuX2xvY2ksIG5fc2VxKSAlPiVzdW1tYXJpemUobWVkaWFuX3JfMiA9IG1lZGlhbih1bmV4cGxhaW5lZF92YXIpLHN0ZF9yXzIgPSBzZCh1bmV4cGxhaW5lZF92YXIpKSAlPiUKICBnZ3Bsb3QoYWVzKHg9YWN0dWFsX3NhbXBsZV9wY3QseT1tZWRpYW5fcl8yLCBncm91cCA9IGFzLmZhY3RvcihhY3R1YWxfdHJhaW5pbmdfc2l6ZSksIGNvbG9yID0gYXMuZmFjdG9yKGFjdHVhbF90cmFpbmluZ19zaXplKSkpICtnZW9tX2xpbmUoc2l6ZT0xKSArIGdlb21fcG9pbnQoc2l6ZT0yKSArIHNjYWxlX3hfY29udGludW91cyhsYWJlbHMgPSBzY2FsZXM6OnBlcmNlbnQsYnJlYWtzID0gYygwLjAxLDAuMDI1LDAuMDUsMC4xKSkgK2xhYnMoY29sb3I9IlRyYWluaW5nIHNpemUiLHg9IlNhbXBsZSBwZXJjZW50YWdlIix5PSJVbmV4cGxhaW5lZCB2YXJpYW5jZSAoJSkiLCB0aXRsZT0iTWVkaWFuIHBlcmNlbnRhZ2Ugb2YgdW5leHBsYWluZWQgdmFyaWFuY2Ugb24gdGVzdCBzZXRzIikgKyBzY2FsZV95X2NvbnRpbnVvdXMobGFiZWxzID0gc2NhbGVzOjpwZXJjZW50LCBleHBhbmQgPSBjKDAsMCkpICt0aGVtZV9jbGFzc2ljKCkrIyBjb29yZF9jYXJ0ZXNpYW4oeWxpbSA9IGMoMCwgTkEpKSsKICArIGZhY2V0X2dyaWQocm93cyA9IHZhcnMobl9zZXEpLCBjb2xzID0gdmFycyhuX2xvY2kpKQoKIyBnZW9tX2Vycm9yYmFyKGFlcyh5bWluPW1heChtZWRpYW5fcl8yLXN0ZF9y
XzIsMCksIHltYXg9bWVkaWFuX3JfMitzdGRfcl8yKSxwb3NpdGlvbj1wb3NpdGlvbl9kb2RnZSgwLjAwNSkpIAoKI2V4YW1wbGVfbXNhX2RhdGEgJT4lIHNlbGVjdCAoYWN0dWFsX3NhbXBsZV9wY3QsIGxhc3NvX3J1bm5pbmdfdGltZSkKYGBgCgo=